* Do not pause output at -more- prompts while this do-file runs.
set more off

* Load the assignee data and retain only the four fields used downstream.
use "assignee", clear
keep name engname kiprisid address

* Remove rows that are exact duplicates on the retained fields, then store
* the de-duplicated file.
duplicates drop
save "KIPRIS_duplicates_dropped", replace

* ---------------------------------------------------------------------------
* Address clean-up
*   location : the address (renamed from address), with a detached
*              "특별시" or "광역시" suffix re-attached to the preceding word
*   loc1     : first word of location, standardised to the full official
*              name of the province or metropolitan city
*   loc2     : second word of location (not standardised)
* ---------------------------------------------------------------------------
use KIPRIS_duplicates_dropped, clear
rename address location

* Re-attach a suffix that was written as a separate word, so that the city
* name and its suffix are picked up together as the first word.
replace location = subinstr(location, " 특별시", "특별시", .)
replace location = subinstr(location, " 광역시", "광역시", .)

generate loc1 = word(location, 1)
generate loc2 = word(location, 2)

* Strip double quotes BEFORE standardising. subinword() only matches whole
* words, so a token that still carries a quote character would otherwise be
* skipped by every rule below and end up as an unstandardised short name.
replace loc1 = subinstr(loc1, `"""', "", .)

* Seoul (special city). The "...시" form is handled first; the bare form is
* a whole-word match, so it does not touch an already expanded name.
replace loc1 = subinword(loc1, "서울시", "서울특별시", .)
replace loc1 = subinword(loc1, "서울",   "서울특별시", .)

* Metropolitan cities: both "<city>시" and the bare "<city>" become
* "<city>광역시".
foreach city in 부산 인천 대구 대전 광주 울산 {
    replace loc1 = subinword(loc1, "`city'시", "`city'광역시", .)
    replace loc1 = subinword(loc1, "`city'",   "`city'광역시", .)
}

* Provinces: short forms to full names.
replace loc1 = subinword(loc1, "경기", "경기도", .)
replace loc1 = subinword(loc1, "강원", "강원도", .)
replace loc1 = subinword(loc1, "충남", "충청남도", .)
replace loc1 = subinword(loc1, "충북", "충청북도", .)
replace loc1 = subinword(loc1, "전남", "전라남도", .)
replace loc1 = subinword(loc1, "전북", "전라북도", .)
replace loc1 = subinword(loc1, "경남", "경상남도", .)
replace loc1 = subinword(loc1, "경북", "경상북도", .)

* Self-governing province and city.
replace loc1 = subinword(loc1, "제주도", "제주특별자치도", .)
replace loc1 = subinword(loc1, "제주",   "제주특별자치도", .)
replace loc1 = subinword(loc1, "세종시", "세종특별자치시", .)
replace loc1 = subinword(loc1, "세종",   "세종특별자치시", .)

* ---------------------------------------------------------------------------
* Korean name clean-up
* Name is a normalised copy of name: parenthesised text, legal-form markers,
* punctuation and spaces are removed, and ASCII letters are upper-cased.
* The original name variable is left untouched.
* ---------------------------------------------------------------------------
generate Name = name

* Remove every parenthesised group, innermost first. A single greedy match
* from the first "(" to the last ")" would also delete whatever sits between
* two separate groups and could blank the whole name, so the pattern excludes
* parentheses inside the group and is applied until nothing is left to remove.
quietly count if regexm(Name, "\([^()]*\)")
while r(N) > 0 {
    quietly replace Name = regexr(Name, "\([^()]*\)", "")
    quietly count if regexm(Name, "\([^()]*\)")
}

replace Name = upper(Name)

* Legal-form markers. The complete bracketed form is removed first, then the
* half-open leftovers, for both round and square brackets.
replace Name = subinstr(Name, "주식회사", "", .)
foreach tag in 주 유 재 합 {
    replace Name = subinstr(Name, "(`tag')", "", .)
    replace Name = subinstr(Name, "(`tag'",  "", .)
    replace Name = subinstr(Name, "`tag')",  "", .)
    replace Name = subinstr(Name, "[`tag']", "", .)
    replace Name = subinstr(Name, "`tag']",  "", .)
    replace Name = subinstr(Name, "[`tag'",  "", .)
}

* Punctuation and spaces, addressed by ASCII code through char() so that no
* character with a special meaning to the Stata parser (left quote 96,
* dollar sign 36, double quote 34) has to appear inside a string literal.
* Codes, in order: apostrophe, double quote, semicolon, caret, less-than,
* period, left quote, underscore, greater-than, exclamation mark, plus,
* question mark, left parenthesis, left brace, backslash, right parenthesis,
* dollar sign, right brace, vertical bar, comma, percent, left bracket,
* asterisk, right bracket, slash, at sign, colon, tilde, hash, hyphen, space.
foreach code of numlist 39 34 59 94 60 46 96 95 62 33 43 63 40 123 92 41 36 125 124 44 37 91 42 93 47 64 58 126 35 45 32 {
    replace Name = subinstr(Name, char(`code'), "", .)
}

* Stray non-ASCII characters seen in the raw names.
replace Name = subinstr(Name, "â", "", .)
replace Name = subinstr(Name, "Ƣ", "", .)

* NOTE(review): the earlier version also had a replacement whose search string
* was empty; its intended target character appears to have been lost from the
* file. It is taken to have had no effect and is omitted - restore it with the
* right character if one is known.

* Display formats only; the stored values are not changed.
format loc* %15s
format *name location Name %40s
format kiprisid %15.0g

* assg_type is the integer part of kiprisid divided by 1e11.
generate assg_type = int(kiprisid / 1e11)

save "KIPRIS_cleaned", replace

******************************************

* Corp_KIPRIS is the output of "matching corp_num to KIPRIS ID.py",
* converted from .csv to a .dta file.
use Corp_KIPRIS, clear

* Number the records within each company so they can be spread into
* Name1, Name2, ... and EngName1, EngName2, ... by the reshape.
* The sort is stable: records that tie on the sort key keep their existing
* relative order, so the numbering is the same on every run. A plain sort
* orders ties arbitrarily.
* NOTE(review): Corp is presumably an abbreviation of CorpNum - confirm that
* Corp_KIPRIS holds no other variable beginning with "Corp".
sort Corp, stable
* Within a by-group _n is already 1 for a single-record group, so the running
* index on its own gives the same numbering as cond(_N==1, 1, _n).
quietly by Corp: generate dup = _n
reshape wide Name EngName, i(CorpNum kiprisid) j(dup)

save kipris_dict, replace

******************************************

* Keep only the companies found in both kipris_dict and dg_dict, matched
* one-to-one on CorpNum; the merge indicator is not retained.
use "kipris_dict", clear
merge 1:1 CorpNum using "dg_dict.dta", keep(match) nogenerate

save "dg_kip_corpnum", replace

* Attach the cleaned KIPRIS records to the CorpNum-kiprisid pairs.
use dg_kip_corpnum, clear
keep CorpNum kiprisid

* Drop hyphens and convert kiprisid to numeric so that it can be matched
* against the numeric kiprisid in KIPRIS_cleaned.
destring kiprisid, ignore("-") replace

* One kiprisid in memory may match several cleaned KIPRIS records; the
* documented one-to-many merge type is 1:m. _merge is kept in the saved file
* because the next step splits the data on it.
merge 1:m kiprisid using KIPRIS_cleaned.dta

save match1, replace


* Split match1 into the records matched on kiprisid (_merge == 3) and the
* KIPRIS records that found no company (_merge == 2).
use match1, clear

preserve
    * --- matched: one row per CorpNum-kiprisid pair, names spread wide ---
    keep if _merge == 3
    drop _merge

    keep CorpNum kiprisid engname Name assg_type
    format CorpNum kiprisid %15.0g

    * Stable sort so that records tying on CorpNum keep their relative order
    * and the Name1, Name2, ... numbering is reproducible between runs.
    sort CorpNum, stable
    * _n is already 1 for a single-record group, so this gives the same
    * numbering as cond(_N==1, 1, _n).
    quietly by CorpNum: generate dup = _n
    reshape wide Name engname, i(CorpNum kiprisid) j(dup)

    save corpnum_matched, replace
    *save dup_corpnum_matched, replace
restore

* --- unmatched KIPRIS records ---
keep if _merge == 2
drop _merge

keep kiprisid Name engname loc1 loc2 assg_type
save corpnum_unmatched, replace

* Records of dg_temp whose CorpNum is absent from corpnum_matched.
use corpnum_matched.dta, clear
keep CorpNum kiprisid

* One CorpNum in memory may match several dg_temp records; the documented
* one-to-many merge type is 1:m.
merge 1:m CorpNum using dg_temp.dta

* _merge == 2 marks records that exist only in dg_temp.
keep if _merge == 2
keep Symbol Name EngName loc1 loc2
rename EngName engname

save dg_corpnum_unmatched, replace
